ETL is a type of data integration that refers to the three steps (extract, transform, load) used to blend data from multiple sources. It's often used to build a data warehouse. During this process, data is taken (extracted) from a source system, converted (transformed) into a format that can be analyzed, and stored (loaded) into a data warehouse or other system. Extract, load, transform (ELT) is an alternate but related approach designed to push processing down to the database for improved performance.
For the 3 questions, I decided to focus exclusively on American superheroes and movies.
Is there a gender bias in the superhero movie industry?
What are the most important variables in predicting a movie's profitability?
For production companies that were acquired, how much on average has each superhero earned?
Data sources: • Superheroes API • Web Scraping • Movies API • Python package - IMDbPY
Data formats: • CSV • XLSX • pgAdmin • HTML
• cleaning • joining • filtering • aggregating (group by, sort, average, count) • deleting columns • data visualization
load the final database into the PostgreSQL database server.
render images for each movie, superhero and a short movie summary from the database.
# import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress
from pprint import pprint
import json
# PART 1 =====================================
# First source of data comes from the superhero API.
# The requests library fetches content from the API with a GET call.
# NOTE(review): the RapidAPI key is hard-coded below; move it to an
# environment variable or config file before sharing this notebook.
url = "https://superhero-search.p.rapidapi.com/"
querystring = {"hero": "spiderman"}
headers = {
    'x-rapidapi-host': "superhero-search.p.rapidapi.com",
    'x-rapidapi-key': "5a8ea71761msh2e12845540e03c1p1bacdfjsn766f6824866c"
}
# Issue the request once. The original performed the same GET twice
# (once for .text, once for .json()), doubling API usage for the same data.
response = requests.get(url, headers=headers, params=querystring)
print(response.text)
# Parse the JSON body; `response` ends up bound to the parsed dict,
# exactly as in the original code.
response = response.json()
print(json.dumps(response, indent=4, sort_keys=True))
# Accumulators: one list per hero attribute, filled by the fetch loop below.
(hero_id, name, fullName, gender, eyeColor, hairColor, height, weight,
 race, placeOfBirth, work, images, publisher, intelligence, power,
 firstAppearance) = ([] for _ in range(16))
# Running totals: successfully parsed heroes vs skipped (malformed) responses.
count = 0
count_error = 0
# Loop through hero ids 0-560 and request each hero's record.
# url and headers are loop-invariant, so they are built once up front.
# NOTE(review): hard-coded API key — move to an environment variable.
url = "https://superhero-search.p.rapidapi.com/"
headers = {
    'x-rapidapi-host': "superhero-search.p.rapidapi.com",
    'x-rapidapi-key': "5a8ea71761msh2e12845540e03c1p1bacdfjsn766f6824866c"
}
for i in range(561):
    response = requests.get(url, headers=headers, params={"id": i}).json()
    try:
        # Extract every field first, append only afterwards: if any key
        # is missing, nothing is appended and all lists stay aligned.
        id_v = response['id']
        name_v = response['name']
        fullName_v = response['biography']['fullName']
        gender_v = response['appearance']['gender']
        eyeColor_v = response['appearance']['eyeColor']
        hairColor_v = response['appearance']['hairColor']
        height_v = response['appearance']['height'][0]
        weight_v = response['appearance']['weight'][0]
        race_v = response['appearance']['race']
        placeOfBirth_v = response['biography']['placeOfBirth']
        work_v = response['work']['occupation']
        images_v = response['images']['lg']
        publisher_v = response['biography']['publisher']
        firstAppearance_v = response['biography']['firstAppearance']
        intelligence_v = response['powerstats']['intelligence']
        power_v = response['powerstats']['power']
        hero_id.append(id_v)
        name.append(name_v)
        fullName.append(fullName_v)
        gender.append(gender_v)
        eyeColor.append(eyeColor_v)
        # BUG FIX: the original appended fullName_v here, so the
        # hairColor column silently duplicated the hero's full name.
        hairColor.append(hairColor_v)
        height.append(height_v)
        weight.append(weight_v)
        race.append(race_v)
        placeOfBirth.append(placeOfBirth_v)
        work.append(work_v)
        images.append(images_v)
        publisher.append(publisher_v)
        intelligence.append(intelligence_v)
        power.append(power_v)
        firstAppearance.append(firstAppearance_v)
        count += 1
    except (KeyError, IndexError):
        # Incomplete record: count it and keep going.
        count_error += 1
        print(f"Missing field/result... skipping. {count_error}")
print(f"Total results :{count} heros")
print(f"Missing field/result:{count_error} errors")
# Assemble the per-attribute lists into a single heroes DataFrame.
hero_columns = ["hero_id", "name", "fullName", "gender", "eyeColor",
                "hairColor", "height", "weight", "race", "placeOfBirth",
                "work", "images", "publisher", "intelligence", "power",
                "firstAppearance"]
hero_values = [hero_id, name, fullName, gender, eyeColor,
               hairColor, height, weight, race, placeOfBirth,
               work, images, publisher, intelligence, power,
               firstAppearance]
hero_dict = dict(zip(hero_columns, hero_values))
hero_df = pd.DataFrame(hero_dict)
print(hero_df.count())
hero_df.head()
# Data cleaning: drop columns that carry no useful information.
hero_df = hero_df.drop(['hero_id', 'hairColor', 'firstAppearance', 'publisher'], axis=1)
hero_df.head()
# Persist the cleaned heroes table to heroes.csv.
hero_df.to_csv('heroes.csv', index=False)
# Reload the saved heroes table for the gender analysis.
test_df = pd.read_csv('heroes.csv')
# Data Cleanup & Analysis: normalize the missing-gender marker '-' to 'Other'.
missing_gender = test_df['gender'] == '-'
test_df.loc[missing_gender, 'gender'] = 'Other'
test_df.head()
# Per-gender summary: hero count and average intelligence (2 decimals).
by_gender = test_df.groupby('gender')
by_gender['intelligence'].mean()
by_gender['name'].count()
gender_df = pd.DataFrame()
gender_df['count'] = by_gender['name'].count()
gender_df['avr intelligence'] = round(by_gender['intelligence'].mean(), 2)
gender_df = gender_df.reset_index()
gender_df
import plotly.graph_objects as go

# Grouped bar chart: hero count and average intelligence per gender.
# NOTE(review): x labels are hard-coded; this assumes gender_df rows come
# out ordered Female, Male, Other after groupby/reset_index — confirm.
x = ['Female', 'Male', 'Other']
fig = go.Figure(data=[
    go.Bar(name='count', x=x, y=gender_df['count']),
    go.Bar(name='intelligence', x=x, y=gender_df['avr intelligence'])
])
# Change the bar mode so the two series sit side by side per gender.
fig.update_layout(barmode='group')
# Global font plus a centered title.
fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    ),
    title={
        'text': 'SuperHeroes Counts and Intelligence by Gender',
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'}
)
fig.update_xaxes(title_text='Gender')
fig.update_yaxes(title_text='Count / Intelligence')
fig.show()
import plotly.express as px

# Scatter of hero power vs weight, one facet (sub-plot) per gender.
df = test_df
fig = px.scatter(df, x="power", y="weight", facet_col="gender")
fig.update_xaxes(title_font=dict(size=18, family='Courier', color='crimson'))
fig.update_yaxes(title_font=dict(size=18, family='Courier', color='red'))
fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    ),
    title={
        'text': 'SuperHeroes Power vs Weight by Gender',
        'y': 1,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'}
)
fig.show()
I found that there is gender inequality in the Superheroes dataset. There are 395 male heroes, 151 female heroes and 15 other gender heroes. The average intelligence for a male hero is 65.42 points. It is higher than the average intelligence for a female hero (62.50). Clearly, there are the immediate issues of parity, equal treatment and human rights.
There appears to be no strong correlation between a hero's power and their weight.
There is no question that watching films can help us escape our everyday lives. However, all films impact our society and popular culture. I am sure that a growing number of female superheroes with a high intelligence will empower girls and women all around the world.
# PART 2 =====================================
# Second source of data: scraping a web page that lists the 50 best
# superhero movies of all time.
from bs4 import BeautifulSoup
import requests

url = 'http://superheroes.theringer.com/?_ga=2.205407573.49282893.1583382995-1394544322.1583382995'
# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object;
# parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')
# Examine the results
# print(soup.prettify())
# Each movie card on the page is an <li class="card-item"> element;
# find_all returns them as an iterable list.
results = soup.find_all('li', class_='card-item')
# One list per scraped movie attribute; filled by the scraping loop below.
(rank, title, date, rottenTomatoes, boxOffice,
 rewatchability, totalScore, image) = ([] for _ in range(8))
# Loop through returned results, extracting one record per movie card.
for result in results:
    # Error handling: a card missing any expected element raises
    # AttributeError (find() returns None) and the card is skipped.
    # NOTE(review): the ['data-src'] lookup on a missing image div would
    # raise TypeError, which is NOT caught — confirm every card has one.
    try:
        # Movie title and release date shown on the card.
        title_v = result.find('span', class_="title").text
        date_v = result.find('span', class_="date").text
        # Ranking position (1-50).
        rank_v = result.find('div', class_="rank").span.text
        # First span.data on the card holds the Rotten Tomatoes score.
        rottenTomatoes_v = result.find('span', class_="data").text
        # Worldwide box office, formatted like '$83.3M'.
        boxOfficeB = result.find('div', class_='expanded-data-col adjusted')
        boxOffice_v = boxOfficeB.find('span', class_='data offset').text
        rewatchabilityB = result.find('div', class_='expanded-data-col rewatchability')
        rewatchability_v = rewatchabilityB.find('span', class_='data').text
        totalScoreB = result.find('div', class_='expanded-data-col score')
        totalScore_v = totalScoreB.find('span', class_='data').text
        # Poster image URL is lazy-loaded via the data-src attribute.
        image_v = result.find('div', class_="image")['data-src']
        # Append only after every field parsed, keeping all lists aligned.
        rank.append(rank_v)
        title.append(title_v)
        date.append(date_v)
        rottenTomatoes.append(rottenTomatoes_v)
        boxOffice.append(boxOffice_v)
        rewatchability.append(rewatchability_v)
        totalScore.append(totalScore_v)
        image.append(image_v)
    except AttributeError as e:
        print(e)
# Build the 50-best-movies DataFrame from the scraped lists.
movie_columns = ["rank", "title", "date", "rottenTomatoes",
                 "boxOffice", "rewatchability", "totalScore", "image"]
movie_values = [rank, title, date, rottenTomatoes,
                boxOffice, rewatchability, totalScore, image]
best_50_movies_dict = dict(zip(movie_columns, movie_values))
movies_df = pd.DataFrame(best_50_movies_dict)
movies_df.to_csv('movies.csv', index=False)
print(movies_df.count())
movies_df.head(50)
# Data Cleanup & Analysis
# Strip the leading '$' and trailing 'M' so '$83.3M' becomes 83.3.
movies_df['boxOffice'] = movies_df['boxOffice'].str.slice(1, -1).astype(float)
movies_df.head()
# This chart compares the cumulative worldwide box office against the
# Rotten Tomatoes rating and total score for the top movies.
import plotly.express as px
from plotly.subplots import make_subplots

x = movies_df['title']
# Figure with a secondary y-axis: dollars on the left, scores on the
# right. (The original also built a throwaway go.Figure() on the line
# before that was immediately overwritten — removed as dead code.)
fig = make_subplots(specs=[[{"secondary_y": True}]])
# NOTE(review): the '+ 5' shifts the plotted box-office values up by 5;
# presumably for visual separation — confirm it is intentional.
fig.add_trace(go.Scatter(x=x, y=movies_df['boxOffice'] + 5,
                         mode='lines',
                         name='Box Office'),
              secondary_y=False,)
fig.add_trace(go.Scatter(x=x, y=movies_df['rottenTomatoes'],
                         mode='lines+markers',
                         name='Rotten Tomatoes'),
              secondary_y=True,)
fig.add_trace(go.Scatter(x=x, y=movies_df['totalScore'],
                         mode='lines+markers',
                         name='Total Score'),
              secondary_y=True,)
fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    ),
    title={
        'text': 'Box office vs Movie Rating',
        'y': 1,
        'x': 0.4,
        'xanchor': 'center',
        'yanchor': 'top'})
# Set y-axes titles
fig.update_yaxes(title_text="Box Office $M", secondary_y=False)
fig.update_yaxes(title_text="Scores", secondary_y=True)
# Extra bottom margin so the long movie titles fit under the x-axis.
fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=200)
)
fig.show()
# Most profitable year: total box office summed per release date.
box = movies_df.groupby('date')['boxOffice'].sum()
box_df = (pd.DataFrame(box)
          .reset_index()
          .sort_values(by=['boxOffice'], ascending=False))
box_df.head()
# Most profitable movie: reorder movies_df itself by box office.
movies_df = movies_df.sort_values(by=['boxOffice'], ascending=False)
movies_df.head()
# Compare box office over the years
import plotly.express as px

df = movies_df
# Bubble scatter: each movie is positioned by release date and box
# office; bubble size and color also encode box office.
# NOTE(review): log_x=True on the 'date' axis (scraped year strings) is
# unusual — confirm the log scale is intentional here.
fig = px.scatter(df, x="date", y="boxOffice", size="boxOffice", color="boxOffice",
                 hover_name="title", log_x=True, size_max=60)
fig.update_xaxes(title_text='year')
fig.update_yaxes(title_text='Box Office ($million)')
fig.update_layout(
    title={
        'text': 'Box Office Over Time',
        'y': 0.95,
        'x': 0.4,
        'xanchor': 'center',
        'yanchor': 'top'}
)
fig.show()
# PART 3 =====================================
# Third source of data: the OMDb API, for more details on our 50 movies.
# This section also uses IMDbPY, a Python package for retrieving and
# managing data about movies and people.
# Note that the ?t= is a query param for the t-itle of the
# movie we want to search for.
url = "http://www.omdbapi.com/?t="
api_key = "&apikey=trilogy"
# Example of response for movie 'The Matrix'
response = requests.get(url + "The Matrix" + api_key)
print(response.url)
data = response.json()
pprint(data)
import imdb

# Reuse the scraped title list as the OMDb search queries.
movies_titles = title
print(title)
# Per-movie accumulators for the OMDb/IMDb lookup loop below.
(imdbID, id_clean, Title, Year, Production, Plot,
 Poster, Actor, Role1, Role2, Actors) = ([] for _ in range(11))
# Number of movies processed successfully.
count = 0
# Look up each scraped movie title in OMDb, then enrich the record with
# IMDbPY cast/role data.
# The IMDb client is loop-invariant, so it is created once. The original
# rebound the loop variable `i` to the client inside the loop, which
# worked only by accident and obscured the code.
ia = imdb.IMDb(accessSystem='http')
for movie_title in movies_titles:
    try:
        response = requests.get(url + movie_title + api_key)
        # Converting the response to JSON, and printing the result.
        data = response.json()
        print(data)
        # FIX: KeyError is now caught below — OMDb answers a title miss
        # with {'Response': 'False', 'Error': ...}, which has none of
        # these keys, and that previously crashed the loop.
        imdbID_v = data['imdbID']
        # IMDbPY wants the numeric id without the 'tt' prefix.
        id_v = imdbID_v[2:]
        Title_v = data['Title']
        Year_v = data['Year']
        Production_v = data['Production']
        Plot_v = data['Plot']
        Poster_v = data['Poster']
        Actors_v = data['Actors']
        # Getting actor and current role
        movie = ia.get_movie(id_v)
        m_title = movie['title']
        print("=============")
        print(count)
        print(Poster_v)
        # First Person object in the cast list = lead actor.
        cast = movie['cast'][0]
        Actor_v = cast['name']
        # currentRole can hold one or two role names (hero vs alter ego).
        if len(cast.currentRole) == 1:
            Role1_v = str(cast.currentRole)
            Role2_v = 'None'
        elif len(cast.currentRole) == 2:
            Role1_v = str(cast.currentRole[0])
            Role2_v = str(cast.currentRole[1])
        else:
            Role1_v = 'None'
            Role2_v = 'None'
        # Second Person object in the cast list, printed for reference only.
        second_cast = movie['cast'][1]
        print(f"Movie title: {Title_v}")
        print(m_title)
        print(id_v)
        print(Actors_v)
        imdbID.append(imdbID_v)
        id_clean.append(id_v)
        Title.append(Title_v)
        Year.append(Year_v)
        Actor.append(Actor_v)
        Role1.append(Role1_v)
        Role2.append(Role2_v)
        print(f"{Actor_v} : {Role1_v} : {Role2_v}")
        print(f"Second_cast actor: {second_cast['name']} : {second_cast.currentRole}")
        print("===end=====")
        Production.append(Production_v)
        Plot.append(Plot_v)
        Poster.append(Poster_v)
        count += 1
    except (AttributeError, KeyError, IndexError) as e:
        # Skip movies OMDb/IMDb can't resolve; all lists stay aligned
        # because appends happen only after every field is extracted.
        print(e)
# Assemble the OMDb/IMDb results into a single DataFrame.
detail_columns = ['imdbID', 'id_clean', 'Title', 'Actor', 'Role1',
                  'Role2', 'Year', 'Production', 'Plot', 'Poster']
detail_values = [imdbID, id_clean, Title, Actor, Role1,
                 Role2, Year, Production, Plot, Poster]
about_movie_dict = dict(zip(detail_columns, detail_values))
print(len(about_movie_dict['Title']))
about_movie_df = pd.DataFrame(about_movie_dict)
print(about_movie_df.count())
# Persist the raw results before the role clean-up that follows.
about_movie_df.to_csv('about_movie.csv', index=False)
about_movie_df.head()
# Data cleaning for Role1/Role2: some rows mix the hero's alias and real
# name. Normalize so Role1 holds the hero name and Role2 the civilian
# name. The original spelled this out as ~70 near-identical .loc lines;
# the rules are data now, applied strictly IN ORDER because later rules
# match values produced by earlier ones (e.g. 'Miles Morales' becomes
# 'Spider-Man' before the Spider-Man Role2 rule fires). Self-assignment
# no-ops from the original (Hellboy->Hellboy, Thor->Thor, etc.) were
# dropped; they had no effect.
# Each rule: (match_column, match_value, target_column, new_value).
role_fixes = [
    ('Role1', 'Bruce Wayne', 'Role1', 'Batman'),
    ('Role1', 'Batman', 'Role2', 'Bruce Wayne'),
    ('Role1', 'Hellboy', 'Role2', 'Anung Un Rama'),
    ('Role1', 'Tony Stark', 'Role1', 'Iron Man'),
    ('Role1', 'Iron Man', 'Role2', 'Tony Stark'),
    ('Role1', 'Eric', 'Role1', 'Crow'),
    ('Role1', 'Crow', 'Role2', 'Eric Draven'),
    ('Role1', 'Clark Kent', 'Role1', 'Superman'),
    ('Role1', 'Superman', 'Role2', 'Clark Kent'),
    ('Role1', 'Thor', 'Role2', 'Thor Odinson'),
    ('Role1', 'Shazam', 'Role1', 'Captain Marvel'),
    ('Title', 'Shazam!', 'Role2', 'Billy Batson'),
    ('Role1', 'Logan', 'Role1', 'Wolverine'),
    ('Role1', 'Wolverine', 'Role2', 'Logan'),
    ('Role1', 'Scott Lang', 'Role1', 'Ant-Man'),
    ('Role1', 'Ant-Man', 'Role2', 'Hank Pym'),
    ('Role1', 'Charles Xavier', 'Role1', 'Professor X'),
    ('Role1', 'Professor X', 'Role2', 'Charles Francis Xavier'),
    ('Role1', 'Arthur', 'Role1', 'Aquaman'),
    ('Role1', 'Aquaman', 'Role2', 'Orin'),
    ('Role1', 'Dr. Stephen Strange', 'Role1', 'Doctor Strange'),
    ('Role1', 'Doctor Strange', 'Role2', 'Stephen Strange'),
    ('Role1', 'Miles Morales', 'Role1', 'Spider-Man'),
    ('Role1', 'Spider-Man', 'Role2', 'Peter Parker'),
    ('Role1', 'Lex Luthor', 'Role2', 'Lex Luthor'),
    ('Role1', 'Professor Charles Xavier', 'Role1', 'Professor X'),
    # Re-applied on purpose: the previous rule may have just created new
    # 'Professor X' rows that still need Role2 set.
    ('Role1', 'Professor X', 'Role2', 'Charles Francis Xavier'),
    ('Actor', 'Ryan Reynolds', 'Role1', 'Juggernaut'),
    ('Role1', 'Juggernaut', 'Role2', 'Cain Marko'),
    ('Actor', 'Brie Larson', 'Role1', 'Captain Marvel'),
    ('Actor', 'Brie Larson', 'Role2', 'Carol Danvers'),
    ('Role1', 'Steve Rogers', 'Role1', 'Captain America'),
    ('Role1', 'Captain America', 'Role2', 'Steve Rogers'),
    ('Role1', 'Peter Quill', 'Role1', 'Star-Lord'),
    ('Role1', 'Star-Lord', 'Role2', 'Peter Jason Quill'),
    ('Role1', 'Peter Parker', 'Role1', 'Spider-Man'),
    # Re-applied for the rows just renamed above.
    ('Role1', 'Spider-Man', 'Role2', 'Peter Parker'),
    ('Role1', 'Diana', 'Role1', 'Wonder Woman'),
    ('Role1', 'Wonder Woman', 'Role2', 'Diana Prince'),
    ('Role1', 'Jor-El', 'Role2', 'Jor-El'),
    ('Role1', "T'Challa", 'Role1', 'Black Panther'),
    ('Role1', 'Black Panther', 'Role2', "T'Challa"),
]
for match_col, match_val, target_col, new_val in role_fixes:
    about_movie_df.loc[about_movie_df[match_col] == match_val, target_col] = new_val
# Normalize the title casing so it joins cleanly with the scraped list.
about_movie_df.loc[about_movie_df['Title'] == "Spider-Man: Far from Home", 'Title'] = 'Spider-Man: Far From Home'
about_movie_df.to_csv('about_movie.csv', index=False)
about_movie_df
# Adding a missing hero (the Crow) that the superhero API doesn't have.
the_crow_dict = {
    "name": ['Crow'],
    "fullName": ['Eric Draven'],
    "gender": ['Male'],
    "eyeColor": ['Blue'],
    "height": ['avr'],
    "weight": ['avr'],
    "race": ['human'],
    "placeOfBirth": ['Earth'],
    "work": ['a rock musician'],
    "images": ['https://upload.wikimedia.org/wikipedia/en/3/39/Crow_ver2.jpg'],
    "intelligence": ['high'],
    "power": ['high']
}
the_crow_df = pd.DataFrame(the_crow_dict)
print(the_crow_df.count())
the_crow_df.head()
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent (same default index behavior).
new_hero_df = pd.concat([hero_df, the_crow_df])
# Checking that the Crow hero is now in the dataset.
new_hero_df.loc[new_hero_df['name'] == 'Crow']
# Adding another hero missing from the API: Jor-El.
jor_el_dict = {
    "name": ['Jor-El'],
    "fullName": ['Jor-El'],
    "gender": ['Male'],
    "eyeColor": ['Blue'],
    "height": ['avr'],
    "weight": ['avr'],
    "race": ['human'],
    "placeOfBirth": ['Krypton,'],
    "work": ['a scientist'],
    "images": ['https://vignette.wikia.nocookie.net/supermanrebirth/images/3/3d/Marlon_Brando_Jor-El.jpg/revision/latest?cb=20130224095531'],
    "intelligence": ['high'],
    "power": ['high']
}
jor_el_df = pd.DataFrame(jor_el_dict)
print(jor_el_df.count())
jor_el_df.head()
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
new_hero_df = pd.concat([new_hero_df, jor_el_df])
# NOTE(review): reset_index below is not assigned back, so it only
# displays a reindexed copy — new_hero_df keeps its duplicate indices.
new_hero_df.reset_index(drop=True)
# Which heroes appear most often in the top-50 superhero movies?
group_hero = about_movie_df.groupby(['Role1', 'Role2'])['Role1'].count()
group_hero_df = (pd.DataFrame(group_hero)
                 .rename(columns={'Role1': 'Count'})
                 .sort_values(by=['Count'], ascending=False))
group_hero_df
# Iron Man and Spider-Man appeared the most.
# Join OMDb details with the scraped movie stats on the title columns,
# then group by hero to see which hero brings the best revenue.
movie_actor_role_df = pd.merge(left=about_movie_df, right=movies_df,
                               left_on='Title', right_on='title')
movie_actor_role_df
# Drop columns duplicated by the join or unused downstream.
movie_actor_role_df = movie_actor_role_df.drop(
    columns=['id_clean', 'date', 'rewatchability', 'rottenTomatoes',
             'totalScore', 'image'])
movie_actor_role_df.head()
# Total box office per hero — the most valuable hero tops the list.
group_hero_revenue = movie_actor_role_df.groupby('Role1')['boxOffice'].sum()
group_hero_revenue_df = (pd.DataFrame(group_hero_revenue)
                         .sort_values(by=['boxOffice'], ascending=False)
                         .reset_index())
group_hero_revenue_df
# Iron Man is the most valuable hero.
# Total box office per production company — the most valuable studio.
group_production_revenue = movie_actor_role_df.groupby('Production')['boxOffice'].sum()
group_production_revenue = (pd.DataFrame(group_production_revenue)
                            .sort_values(by=['boxOffice'], ascending=False)
                            .reset_index())
group_production_revenue
# Walt Disney Pictures is the most profitable company
# Bar chart: total box office per hero.
import plotly.express as px

df = group_hero_revenue_df
fig = px.bar(df, x='Role1', y='boxOffice')
fig.update_xaxes(title_text='Hero')
fig.update_yaxes(title_text='Box Office ($million)')
fig.update_layout(
    title={
        'text': 'Box Office per Hero',
        'y': 0.95,
        'x': 0.4,
        'xanchor': 'center',
        'yanchor': 'top'}
)
fig.show()
# Which production company appears most often in the top 50 movies?
group_production = about_movie_df.groupby('Production')['Production'].count()
group_production_df = (pd.DataFrame(group_production)
                       .rename(columns={'Production': 'Count'})
                       .sort_values(by=['Count'], ascending=False))
group_production_df
# Bar chart: total box office per production studio.
import plotly.express as px

df = group_production_revenue
fig = px.bar(df, x='Production', y='boxOffice')
fig.update_xaxes(title_text='Production Studio')
fig.update_yaxes(title_text='Box Office ($million)')
fig.update_layout(
    title={
        'text': 'Box Office per Production Studio',
        'y': 0.95,
        'x': 0.4,
        'xanchor': 'center',
        'yanchor': 'top'}
)
# Force the Jupyter notebook renderer for this figure.
fig.show(renderer="notebook")
# PART 4 =========================== Final ======================
# Join movie_actor_role_df with new_hero_df on (Role1, Role2) ==
# (name, fullName). The result is the final dataset: the best 50
# superhero movies, the main actor for each movie, and the matching
# hero record.
movie_actor_hero_df = pd.merge(left=movie_actor_role_df, right=new_hero_df, left_on=['Role1', 'Role2'], right_on=['name', 'fullName'])
movie_actor_hero_df
# Drop join keys and hero attributes not needed in the final table.
movie_actor_hero_df = movie_actor_hero_df.drop(['imdbID','title', 'Role1' , 'Role2', 'Production', 'eyeColor', 'height', 'weight', 'race', 'intelligence', 'power'], axis = 1)
movie_actor_hero_df.head(1)
# Rename columns to the final schema.
# NOTE(review): 'Hero_Genger' is a misspelling of 'Hero_Gender', but the
# SQLAlchemy model uses the same spelling — fix both together or leave
# both as-is so the CSV and the table stay aligned.
movie_actor_hero_df = movie_actor_hero_df.rename(columns={"rank": "Movie_Rank",
                                                          "boxOffice": "Box_Office",
                                                          "name": "Hero_Name",
                                                          "fullName": "Hero_Full_Name",
                                                          "gender": "Hero_Genger",
                                                          "placeOfBirth": "Hero_Place_Of_Birth",
                                                          "work": "Hero_work",
                                                          "images": "Hero_Images"
                                                          })
movie_actor_hero_df
# Save the final joined dataset to movie_actor_hero.csv.
movie_actor_hero_df.to_csv('movie_actor_hero.csv', index=False)
movie_actor_hero_df.dtypes
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
import psycopg2
# Define movie_actor_hero table
class movie_actor_hero(Base):
    """ORM mapping for the final movie/actor/hero table loaded into
    PostgreSQL. Column names mirror movie_actor_hero.csv exactly,
    including the 'Hero_Genger' misspelling."""
    __tablename__ = 'movie_actor_hero'
    # Surrogate primary key; not present in the CSV.
    id = Column(Integer, primary_key=True)
    Title = Column(String)
    Actor = Column(String)
    Year = Column(String)
    Plot = Column(String)
    Poster = Column(String)
    Movie_Rank = Column(String)
    # NOTE(review): the boxOffice values are floats like 83.3, but this
    # column is Integer — confirm the DB doesn't truncate or reject them.
    Box_Office = Column(Integer)
    Hero_Name = Column(String)
    Hero_Full_Name = Column(String)
    Hero_Genger = Column(String)  # sic — matches the DataFrame rename
    Hero_Place_Of_Birth = Column(String)
    Hero_work = Column(String)
    Hero_Images = Column(String)
# Registered table metadata (notebook display only; no effect in a script).
Base.metadata.tables
# Reload the final CSV for the database load.
csv_file = "movie_actor_hero.csv"
premise_data_df = pd.read_csv(csv_file)
premise_data_df.head()
# Create the database itself via psycopg2 with autocommit enabled
# (CREATE DATABASE cannot run inside a transaction).
# NOTE(review): credentials are hard-coded, and this raises if the
# database already exists — confirm that's acceptable for a one-off load.
engine = psycopg2.connect(f'postgresql://postgres:postgres@localhost:5432')
engine.autocommit = True
cursor = engine.cursor()
sql = '''Create database best_50_movies_heroes_db''';
cursor.execute(sql)
# Create a SQLAlchemy connection to the newly created database.
engine = create_engine(f'postgresql://postgres:postgres@localhost:5432/best_50_movies_heroes_db')
# Create the movie_actor_hero table from the ORM metadata.
Base.metadata.create_all(engine)
# NOTE(review): Engine.table_names() is deprecated and removed in
# SQLAlchemy 2.0; inspect(engine).get_table_names() is the modern call.
engine.table_names()
# Load DataFrame rows into the movie_actor_hero table.
premise_data_df.to_sql(name='movie_actor_hero', con=engine, if_exists='append', index=False)
# Confirm data has been added by querying the movie_actor_hero table
pd.read_sql_query('select * from movie_actor_hero', con=engine).head()
# Export the Jupyter notebook as a standalone HTML report.
from nbconvert import HTMLExporter
import codecs
import nbformat

notebook_name = 'ETL-Project.ipynb'
output_file_name = 'output.html'
exporter = HTMLExporter()
# Read the notebook file and render it to HTML.
output_notebook = nbformat.read(notebook_name, as_version=4)
output, resources = exporter.from_notebook_node(output_notebook)
# FIX: use a context manager so the file handle is closed even on error;
# the original opened the stream and never closed it.
with codecs.open(output_file_name, 'w', encoding='utf-8') as html_file:
    html_file.write(output)